import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Matplotlib and seaborn settings
plt.rcParams['figure.figsize'] = (16,10)
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name this
# installation actually provides.
estilo_grafico = (
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)
plt.style.use(estilo_grafico)
import warnings
# Silence all warnings for the rest of the session
warnings.filterwarnings('ignore')
# Load the three source tables (books, ratings, users) in one pass
books, ratings, users = (
    pd.read_csv(caminho)
    for caminho in (
        "Dados_Recomendacao - Livro/Books.csv",
        "Dados_Recomendacao - Livro/Ratings.csv",
        "Dados_Recomendacao - Livro/Users.csv",
    )
)
# Shape (rows, columns) of each table
books.shape, ratings.shape, users.shape
((271360, 8), (1149780, 3), (278858, 3))
# Preview the first three rows of the books table
books.head(3)
| ISBN | Book-Title | Book-Author | Year-Of-Publication | Publisher | Image-URL-S | Image-URL-M | Image-URL-L | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0195153448 | Classical Mythology | Mark P. O. Morford | 2002 | Oxford University Press | http://images.amazon.com/images/P/0195153448.0... | http://images.amazon.com/images/P/0195153448.0... | http://images.amazon.com/images/P/0195153448.0... |
| 1 | 0002005018 | Clara Callan | Richard Bruce Wright | 2001 | HarperFlamingo Canada | http://images.amazon.com/images/P/0002005018.0... | http://images.amazon.com/images/P/0002005018.0... | http://images.amazon.com/images/P/0002005018.0... |
| 2 | 0060973129 | Decision in Normandy | Carlo D'Este | 1991 | HarperPerennial | http://images.amazon.com/images/P/0060973129.0... | http://images.amazon.com/images/P/0060973129.0... | http://images.amazon.com/images/P/0060973129.0... |
# Column dtypes and non-null counts of the books table
books.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 271360 entries, 0 to 271359 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ISBN 271360 non-null object 1 Book-Title 271360 non-null object 2 Book-Author 271359 non-null object 3 Year-Of-Publication 271360 non-null object 4 Publisher 271358 non-null object 5 Image-URL-S 271360 non-null object 6 Image-URL-M 271360 non-null object 7 Image-URL-L 271357 non-null object dtypes: object(8) memory usage: 16.6+ MB
# Preview the first three rows of the ratings table
ratings.head(3)
| User-ID | ISBN | Book-Rating | |
|---|---|---|---|
| 0 | 276725 | 034545104X | 0 |
| 1 | 276726 | 0155061224 | 5 |
| 2 | 276727 | 0446520802 | 0 |
# Column dtypes and non-null counts of the ratings table
ratings.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1149780 entries, 0 to 1149779 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 User-ID 1149780 non-null int64 1 ISBN 1149780 non-null object 2 Book-Rating 1149780 non-null int64 dtypes: int64(2), object(1) memory usage: 26.3+ MB
# Preview the first three rows of the users table
users.head(3)
| User-ID | Location | Age | |
|---|---|---|---|
| 0 | 1 | nyc, new york, usa | NaN |
| 1 | 2 | stockton, california, usa | 18.0 |
| 2 | 3 | moscow, yukon territory, russia | NaN |
# Column dtypes and non-null counts of the users table (Age has missing values)
users.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 278858 entries, 0 to 278857 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 User-ID 278858 non-null int64 1 Location 278858 non-null object 2 Age 168096 non-null float64 dtypes: float64(1), int64(1), object(1) memory usage: 6.4+ MB
# Join the tables: ratings with their users on 'User-ID', then with book
# metadata on 'ISBN'. Both joins are inner joins.
dados_cruzados = (
    ratings
    .merge(users, how='inner', on='User-ID')
    .merge(books, how='inner', on='ISBN')
)
dados_cruzados.head(3)
| User-ID | ISBN | Book-Rating | Location | Age | Book-Title | Book-Author | Year-Of-Publication | Publisher | Image-URL-S | Image-URL-M | Image-URL-L | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 276725 | 034545104X | 0 | tyler, texas, usa | NaN | Flesh Tones: A Novel | M. J. Rose | 2002 | Ballantine Books | http://images.amazon.com/images/P/034545104X.0... | http://images.amazon.com/images/P/034545104X.0... | http://images.amazon.com/images/P/034545104X.0... |
| 1 | 2313 | 034545104X | 5 | cincinnati, ohio, usa | 23.0 | Flesh Tones: A Novel | M. J. Rose | 2002 | Ballantine Books | http://images.amazon.com/images/P/034545104X.0... | http://images.amazon.com/images/P/034545104X.0... | http://images.amazon.com/images/P/034545104X.0... |
| 2 | 6543 | 034545104X | 0 | strafford, missouri, usa | 34.0 | Flesh Tones: A Novel | M. J. Rose | 2002 | Ballantine Books | http://images.amazon.com/images/P/034545104X.0... | http://images.amazon.com/images/P/034545104X.0... | http://images.amazon.com/images/P/034545104X.0... |
# Column dtypes and non-null counts of the merged table
dados_cruzados.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1031136 entries, 0 to 1031135 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 User-ID 1031136 non-null int64 1 ISBN 1031136 non-null object 2 Book-Rating 1031136 non-null int64 3 Location 1031136 non-null object 4 Age 753301 non-null float64 5 Book-Title 1031136 non-null object 6 Book-Author 1031135 non-null object 7 Year-Of-Publication 1031136 non-null object 8 Publisher 1031134 non-null object 9 Image-URL-S 1031136 non-null object 10 Image-URL-M 1031136 non-null object 11 Image-URL-L 1031132 non-null object dtypes: float64(1), int64(2), object(9) memory usage: 102.3+ MB
# Rows whose publication year holds the text 'DK Publishing Inc' instead of a year
dados_cruzados.loc[
    dados_cruzados['Year-Of-Publication'] == 'DK Publishing Inc',
    'Year-Of-Publication',
]
911154 DK Publishing Inc 949657 DK Publishing Inc 949658 DK Publishing Inc Name: Year-Of-Publication, dtype: object
# Convert the publication-year column to numeric values.
# A few rows hold text in this column (for example 'DK Publishing Inc').
# errors='coerce' turns every unparseable value into NaN, so the cleanup does
# not depend on hard-coded row positions, which shift whenever the input files
# or the merge order change.
dados_cruzados['Year-Of-Publication'] = pd.to_numeric(
    dados_cruzados['Year-Of-Publication'], errors='coerce'
)
# Keep only the text after the last ", " of each location (the country),
# upper-cased
dados_cruzados['Location'] = [
    local.rsplit(", ", 1)[-1].upper() for local in dados_cruzados['Location']
]
dados_cruzados.describe()
| User-ID | Book-Rating | Age | Year-Of-Publication | |
|---|---|---|---|---|
| count | 1.031136e+06 | 1.031136e+06 | 753301.000000 | 1.031132e+06 |
| mean | 1.405945e+05 | 2.839051e+00 | 37.397648 | 1.968195e+03 |
| std | 8.052466e+04 | 3.854157e+00 | 14.098254 | 2.311015e+02 |
| min | 2.000000e+00 | 0.000000e+00 | 0.000000 | 0.000000e+00 |
| 25% | 7.041500e+04 | 0.000000e+00 | 28.000000 | 1.992000e+03 |
| 50% | 1.412100e+05 | 0.000000e+00 | 35.000000 | 1.997000e+03 |
| 75% | 2.114260e+05 | 7.000000e+00 | 45.000000 | 2.001000e+03 |
| max | 2.788540e+05 | 1.000000e+01 | 244.000000 | 2.050000e+03 |
# Keep only rows whose rating is above zero
tem_nota = dados_cruzados['Book-Rating'].gt(0)
dados_cruzados = dados_cruzados.loc[tem_nota]
dados_cruzados.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 383842 entries, 1 to 1031135 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 User-ID 383842 non-null int64 1 ISBN 383842 non-null object 2 Book-Rating 383842 non-null int64 3 Location 383842 non-null object 4 Age 269621 non-null float64 5 Book-Title 383842 non-null object 6 Book-Author 383841 non-null object 7 Year-Of-Publication 383841 non-null float64 8 Publisher 383840 non-null object 9 Image-URL-S 383842 non-null object 10 Image-URL-M 383842 non-null object 11 Image-URL-L 383841 non-null object dtypes: float64(2), int64(2), object(8) memory usage: 38.1+ MB
# Summary statistics of the numeric columns after dropping zero ratings
dados_cruzados.describe()
| User-ID | Book-Rating | Age | Year-Of-Publication | |
|---|---|---|---|---|
| count | 383842.000000 | 383842.000000 | 269621.000000 | 383841.000000 |
| mean | 136031.461260 | 7.626701 | 36.835829 | 1965.636678 |
| std | 80482.299401 | 1.841339 | 13.753045 | 243.221296 |
| min | 8.000000 | 1.000000 | 0.000000 | 0.000000 |
| 25% | 67591.000000 | 7.000000 | 28.000000 | 1992.000000 |
| 50% | 133789.000000 | 8.000000 | 35.000000 | 1997.000000 |
| 75% | 206219.000000 | 9.000000 | 45.000000 | 2001.000000 |
| max | 278854.000000 | 10.000000 | 244.000000 | 2050.000000 |
# Box plot of the rating distribution on a dedicated 7x4 figure
figura_notas, eixo_notas = plt.subplots(figsize=(7, 4))
eixo_notas.set_title("Análise avaliações")
sns.boxplot(data=dados_cruzados, x='Book-Rating', ax=eixo_notas)
<AxesSubplot:title={'center':'Análise avaliações'}, xlabel='Book-Rating'>
# Per-title rating statistics: count, mean, max, min and median
estatisticas = {
    'Quantidade': ('Book-Title', 'count'),
    'Média': ('Book-Rating', 'mean'),
    'Max': ('Book-Rating', 'max'),
    'Min': ('Book-Rating', 'min'),
    'Mediana': ('Book-Rating', 'median'),
}
analise = dados_cruzados.groupby('Book-Title').agg(**estatisticas)
analise.head()
| Quantidade | Média | Max | Min | Mediana | |
|---|---|---|---|---|---|
| Book-Title | |||||
| A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America) | 1 | 9.000000 | 9 | 9 | 9.0 |
| Ask Lily (Young Women of Faith: Lily Series, Book 5) | 1 | 8.000000 | 8 | 8 | 8.0 |
| Dark Justice | 1 | 10.000000 | 10 | 10 | 10.0 |
| Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth | 7 | 7.142857 | 10 | 1 | 7.0 |
| Final Fantasy Anthology: Official Strategy Guide (Brady Games) | 2 | 10.000000 | 10 | 10 | 10.0 |
# Ten titles with the most ratings
analise.sort_values("Quantidade", ascending = False).head(10)
| Quantidade | Média | Max | Min | Mediana | |
|---|---|---|---|---|---|
| Book-Title | |||||
| The Lovely Bones: A Novel | 707 | 8.185290 | 10 | 1 | 8.0 |
| Wild Animus | 581 | 4.390706 | 10 | 1 | 4.0 |
| The Da Vinci Code | 494 | 8.439271 | 10 | 1 | 9.0 |
| The Secret Life of Bees | 406 | 8.477833 | 10 | 2 | 9.0 |
| The Nanny Diaries: A Novel | 393 | 7.437659 | 10 | 1 | 8.0 |
| The Red Tent (Bestselling Backlist) | 383 | 8.182768 | 10 | 2 | 9.0 |
| Bridget Jones's Diary | 377 | 7.625995 | 10 | 1 | 8.0 |
| A Painted House | 366 | 7.398907 | 10 | 1 | 8.0 |
| Life of Pi | 336 | 8.080357 | 10 | 1 | 8.0 |
| Harry Potter and the Chamber of Secrets (Book 2) | 326 | 8.840491 | 10 | 4 | 9.0 |
# Number of ratings versus mean rating, one point per title
# (title typo fixed: 'Méxia' -> 'Média')
px.scatter(data_frame = analise, x = 'Quantidade', y= 'Média', title = 'Média x Quantidade')
# Distribution of the per-title rating count: most titles have very few ratings
analise['Quantidade'].describe()
count 135567.000000 mean 2.831382 std 9.135691 min 1.000000 25% 1.000000 50% 1.000000 75% 2.000000 max 707.000000 Name: Quantidade, dtype: float64
def Classificao_Quantidade( Quantidade ):
    '''
    Map a rating count to the label of its bucket.

    Quantidade: any value accepted by int() (int, float, numeric string).
    Returns one of '1-5 Avaliações', '6-10 Avaliações', '11-50 Avaliações',
    '51-100 Avaliações' or '>100 Avaliações'.
    '''
    # Convert once instead of in every branch
    total = int( Quantidade )
    if total <= 5:
        return '1-5 Avaliações'
    if total <= 10:
        return '6-10 Avaliações'
    if total <= 50:
        return '11-50 Avaliações'
    if total <= 100:
        return '51-100 Avaliações'
    # This bucket starts at 101, so the label is '>100' (it used to read '>101')
    return '>100 Avaliações'
# Share of titles falling in each rating-count bucket
Pizza = analise['Quantidade'].apply( Classificao_Quantidade ).value_counts( normalize=True )
# Turn the Series into a two-column DataFrame.
# The column names are set explicitly because value_counts() labels its result
# differently across pandas versions (2.0 names the values 'proportion' and
# moves the original name to the index); a bare reset_index() would then no
# longer produce the 'index' / 'Quantidade' columns that px.pie reads below.
Pizza = Pizza.rename_axis( 'index' ).reset_index( name='Quantidade' )
# Pie chart of the bucket shares
px.pie(
    data_frame=Pizza,
    # Bucket label and its share
    names='index', values='Quantidade',
    title='Divisão das Quantidades'
)
# Number of titles with more than 50 ratings
analise['Quantidade'].gt(50).sum()
626
# Drop titles with fewer than 50 ratings: attach each title's rating count to
# the rows, then keep only rows whose count is at least 50
analise = analise.reset_index()
dados_cruzados = (
    dados_cruzados
    .merge(analise.loc[:, ['Book-Title', 'Quantidade']], on='Book-Title')
    .loc[lambda tabela: tabela['Quantidade'].ge(50)]
)
dados_cruzados.shape
(65477, 13)
# Number of ratings per publication year, ordered by year.
# The column names are set explicitly ('index' = year, 'Year-Of-Publication' =
# count) because a bare value_counts().reset_index() yields different column
# names depending on the pandas version, and the later filter and bar chart
# read these two names.
analise_ano = (
    dados_cruzados['Year-Of-Publication']
    .value_counts()
    .sort_index()
    .rename_axis('index')
    .reset_index(name='Year-Of-Publication')
)
analise_ano.head()
| index | Year-Of-Publication | |
|---|---|---|
| 0 | 0.0 | 336 |
| 1 | 1920.0 | 1 |
| 2 | 1938.0 | 6 |
| 3 | 1943.0 | 3 |
| 4 | 1948.0 | 2 |
# Keep publication years strictly between 1990 and 2020
anos = analise_ano['index']
filtro = analise_ano.loc[anos.gt(1990) & anos.lt(2020)]
# Bar chart: ratings per publication year
figura_ano, eixo_ano = plt.subplots(figsize=(20,10))
eixo_ano.set_title('Analisando ano de publicação')
eixo_ano.bar(filtro['index'], filtro['Year-Of-Publication'])
<BarContainer object of 16 artists>
# Rating count and mean rating per author, most-rated authors first
notas_por_autor = dados_cruzados.groupby('Book-Author')['Book-Rating']
resumo_autores = notas_por_autor.agg(Quantidade='count', Media='mean')
resumo_autores.sort_values(by='Quantidade', ascending=False)
| Quantidade | Media | |
|---|---|---|
| Book-Author | ||
| Stephen King | 3326 | 7.787432 |
| John Grisham | 2379 | 7.526692 |
| James Patterson | 2010 | 7.710945 |
| J. K. Rowling | 1552 | 8.994845 |
| Janet Evanovich | 1225 | 7.968980 |
| ... | ... | ... |
| Irving John | 1 | 6.000000 |
| Patricia Springer | 1 | 8.000000 |
| Patricia Potter | 1 | 9.000000 |
| Patricia Highsmith | 1 | 6.000000 |
| John Harvey | 1 | 8.000000 |
482 rows × 2 columns
# Percentage of ratings per location value, 20 most frequent
participacao = dados_cruzados['Location'].value_counts(normalize=True)
participacao.head(20).mul(100)
USA 76.797960 CANADA 9.406356 , 2.304626 UNITED KINGDOM 2.177864 AUSTRALIA 1.376056 N/A 0.934679 GERMANY 0.884280 PORTUGAL 0.722391 MALAYSIA 0.661301 SPAIN 0.494830 NETHERLANDS 0.365014 NEW ZEALAND 0.282542 FRANCE 0.244361 ITALY 0.207707 SINGAPORE 0.171052 PHILIPPINES 0.157307 CALIFORNIA, 0.123708 SWITZERLAND 0.119126 AUSTRIA 0.116071 JAPAN 0.111490 Name: Location, dtype: float64
# Clean noisy country values: ',' and 'N/A' become empty, 'CALIFORNIA,' becomes 'USA'.
# The column is reassigned instead of calling replace(..., inplace=True) on
# dados_cruzados['Location']: under pandas Copy-on-Write that chained in-place
# call modifies a temporary Series and leaves the DataFrame unchanged.
dados_cruzados['Location'] = dados_cruzados['Location'].replace(
    {',': '', 'N/A': '', 'CALIFORNIA,': 'USA'}
)
# Box plot of the age distribution on a dedicated 20x10 figure
figura_idade, eixo_idade = plt.subplots(figsize=(20,10))
eixo_idade.set_title('Distribuição idades')
sns.boxplot(x='Age', data=dados_cruzados, ax=eixo_idade)
<AxesSubplot:title={'center':'Distribuição idades'}, xlabel='Age'>
# Keep ages from 10 up to (but not including) 100.
# Rows with a missing age are dropped as well, since NaN fails both comparisons.
idade = dados_cruzados['Age']
dados_cruzados = dados_cruzados.loc[idade.ge(10) & idade.lt(100)]
# Build the title-by-user rating matrix; title/user pairs without a rating become 0
matriz = dados_cruzados.pivot_table(
    index='Book-Title',
    columns='User-ID',
    values='Book-Rating',
).fillna(0)
matriz.head()
| User-ID | 42 | 51 | 99 | 114 | 125 | 165 | 183 | 185 | 242 | 254 | ... | 278755 | 278798 | 278800 | 278807 | 278824 | 278832 | 278836 | 278843 | 278844 | 278846 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Book-Title | |||||||||||||||||||||
| 1984 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1st to Die: A Novel | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2nd Chance | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 Blondes | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 84 Charing Cross Road | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 14965 columns
# Sparse (CSC) copy of the rating matrix, one row per title
from scipy.sparse import csc_matrix
from sklearn.neighbors import NearestNeighbors

matriz_sparse = csc_matrix(matriz)
# Brute-force nearest-neighbour search returning 5 neighbours per query
parametros_knn = {'n_neighbors': 5, 'algorithm': 'brute'}
modelo = NearestNeighbors(**parametros_knn)
# Fit on the title rows
modelo.fit(matriz_sparse)
NearestNeighbors(algorithm='brute')
# Query: the matrix row at position 213, as a 1 x n_users array
selecionar_livro = matriz.iloc[[213]].to_numpy()
# Distances and row positions of the nearest neighbours of the query
distancia, recomendacao = modelo.kneighbors(selecionar_livro)
# Print the titles at the returned positions (one Index per query row)
for vizinhos in recomendacao:
    livro = matriz.index[vizinhos]
    print(livro)
Index(['Harry Potter and the Chamber of Secrets (Book 2)',
'Harry Potter and the Prisoner of Azkaban (Book 3)',
'Harry Potter and the Goblet of Fire (Book 4)',
'Harry Potter and the Sorcerer's Stone (Book 1)',
'The Shelters of Stone (Earth's Children Series, No 5)'],
dtype='object', name='Book-Title')
# Cover-image URLs and downloaded covers for the titles returned by the model.
# urllib.request and PIL.Image are imported explicitly: a bare `import urllib`
# or `import PIL` does not load those submodules, so the attribute access only
# worked when some other import had already loaded them.
import io
import PIL.Image
import urllib.request
import requests
import matplotlib.image as mpimg
urls = []
imagens = []
for posicao in recomendacao[0]:
    livro = matriz.index[posicao]
    # First catalogue entry with this title supplies the large cover URL
    link = books.loc[books['Book-Title'] == livro, 'Image-URL-L'].iloc[0]
    urls.append(link)
    # Read the whole payload and close the connection before decoding; the
    # timeout keeps an unresponsive host from blocking the notebook forever
    with urllib.request.urlopen(link, timeout=30) as resposta:
        conteudo = resposta.read()
    imagens.append(PIL.Image.open(io.BytesIO(conteudo)))
# Build the report: one row of five subplots, the selected book followed by
# its four recommendations
titulos = ['Seleção','Recomendação 1', 'Recomendação 2', 'Recomendação 3', 'Recomendação 4']
import plotly.graph_objects as Go
from plotly.subplots import make_subplots
figure = make_subplots(rows=1, cols=5, subplot_titles=titulos)
figure.update_layout(
    title_text='Sistema de Recomendação',
    width=1200,
    height=500,
    showlegend=False,
)
# Place each cover in its own column (subplot columns are numbered from 1)
for coluna, capa in enumerate(imagens, start=1):
    figure.add_trace(Go.Image(z=capa), row=1, col=coluna)
# Render the figure
figure.show()